import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook+vscode"
# setup global variables - data file directory and name
DATA_FILES_DIR = 'data'  # folder containing the raw per-source CSV files
DATA_FILE_NAME = 'data.csv.gz'  # combined, gzip-compressed output file
# NOTE(review): os.listdir returns entries in arbitrary order; harmless here
# because the combined frame is sorted later, but sort this list if a
# deterministic read order ever matters.
all_data_files = os.listdir(DATA_FILES_DIR)
def create_data_file():
    """Concatenate all raw CSV files into one sorted, gzip-compressed file.

    Reads every file listed in the module-level ``all_data_files`` (located
    in ``DATA_FILES_DIR``), cleans the ``city`` column, sorts the combined
    frame by date/city and writes it to ``DATA_FILE_NAME``.
    """
    # Build full paths instead of os.chdir()/os.chdir('..') so the process
    # working directory is never mutated (the original left the cwd changed
    # if pd.read_csv raised midway).
    paths = [os.path.join(DATA_FILES_DIR, name) for name in all_data_files]
    df = pd.concat(map(pd.read_csv, paths), ignore_index=True)
    # Strip the country suffix from the city names (literal match).
    df['city'] = df['city'].str.replace(',Croatia', '', regex=False)
    # BUG FIX: r'+' is an invalid regular expression ("nothing to repeat")
    # when pandas treats the pattern as a regex; the intent is a literal
    # '+' -> ' ' substitution, so request a plain string replace explicitly.
    df['city'] = df['city'].str.replace('+', ' ', regex=False)
    # Sort by datetime then city and persist compressed.
    df = df.sort_values(by=['date_time', 'city'])
    df.to_csv(DATA_FILE_NAME, index=False, compression='gzip')
    print('Data processed successfully')
# Build the processed data file only when it is not already on disk.
if os.path.exists(DATA_FILE_NAME):
    print('Data has already been processed')
else:
    print('Creating data file')
    create_data_file()
Data has already been processed
# import data
# Load the preprocessed, gzip-compressed combined dataset back into memory.
df_data = pd.read_csv(DATA_FILE_NAME, compression='gzip')
# Sanity check: column names, dtypes, non-null counts and memory usage.
df_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1081584 entries, 0 to 1081583 Data columns (total 37 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date_time 1081584 non-null object 1 sunrise 1081584 non-null object 2 sunset 1081584 non-null object 3 moonrise 1081584 non-null object 4 moonset 1081584 non-null object 5 moon_phase 1081584 non-null object 6 moon_illumination 1081584 non-null int64 7 time 1081584 non-null int64 8 tempC 1081584 non-null int64 9 tempF 1081584 non-null int64 10 windspeedMiles 1081584 non-null int64 11 windspeedKmph 1081584 non-null int64 12 winddirDegree 1081584 non-null int64 13 winddir16Point 1081584 non-null object 14 weatherCode 1081584 non-null int64 15 weatherIconUrl 1081584 non-null object 16 weatherDesc 1081584 non-null object 17 precipMM 1081584 non-null float64 18 precipInches 1081584 non-null float64 19 humidity 1081584 non-null int64 20 visibility 1081584 non-null int64 21 visibilityMiles 1081584 non-null int64 22 pressure 1081584 non-null int64 23 pressureInches 1081584 non-null int64 24 cloudcover 1081584 non-null int64 25 HeatIndexC 1081584 non-null int64 26 HeatIndexF 1081584 non-null int64 27 DewPointC 1081584 non-null int64 28 DewPointF 1081584 non-null int64 29 WindChillC 1081584 non-null int64 30 WindChillF 1081584 non-null int64 31 WindGustMiles 1081584 non-null int64 32 WindGustKmph 1081584 non-null int64 33 FeelsLikeC 1081584 non-null int64 34 FeelsLikeF 1081584 non-null int64 35 uvIndex 1081584 non-null int64 36 city 1081584 non-null object dtypes: float64(2), int64(25), object(10) memory usage: 305.3+ MB
df_data.head(5)
| date_time | sunrise | sunset | moonrise | moonset | moon_phase | moon_illumination | time | tempC | tempF | ... | DewPointC | DewPointF | WindChillC | WindChillF | WindGustMiles | WindGustKmph | FeelsLikeC | FeelsLikeF | uvIndex | city | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2016-10-31 00:00:00 | 06:40 AM | 04:50 PM | 07:02 AM | 05:38 PM | New Moon | 0 | 0 | 5 | 40 | ... | 2 | 36 | 1 | 34 | 16 | 25 | 1 | 34 | 3 | Bakar |
| 1 | 2016-10-31 00:00:00 | 06:42 AM | 04:54 PM | 07:05 AM | 05:41 PM | New Moon | 0 | 0 | 14 | 58 | ... | 8 | 47 | 12 | 54 | 22 | 35 | 12 | 54 | 4 | Bale |
| 2 | 2016-10-31 00:00:00 | 06:38 AM | 04:50 PM | 07:00 AM | 05:38 PM | New Moon | 0 | 0 | 9 | 48 | ... | 5 | 41 | 6 | 43 | 17 | 28 | 6 | 43 | 3 | Banjol |
| 3 | 2016-10-31 00:00:00 | 06:26 AM | 04:44 PM | 06:48 AM | 05:31 PM | New Moon | 0 | 0 | 14 | 57 | ... | 5 | 41 | 12 | 53 | 20 | 33 | 12 | 53 | 4 | Baska |
| 4 | 2016-10-31 00:00:00 | 06:41 AM | 04:52 PM | 07:04 AM | 05:39 PM | New Moon | 0 | 0 | 5 | 40 | ... | 2 | 36 | 1 | 34 | 16 | 25 | 1 | 34 | 3 | Boljun |
5 rows × 37 columns
# global variables
CORRELATION_DIR = 'correlation_plots'  # output folder for all correlation plots
# Create the plot output directory on first run.
if not os.path.exists(CORRELATION_DIR):
    print(f'Creating folder {CORRELATION_DIR}')
    # exist_ok avoids a crash if the directory appears between the
    # exists() check and the creation (TOCTOU race in the original mkdir).
    os.makedirs(CORRELATION_DIR, exist_ok=True)
# to always have the newest plot versions, delete file before creating new one
def remove_file_if_exists(file_path):
    """Delete *file_path* if present; silently do nothing when it is absent."""
    try:
        os.remove(file_path)
    except FileNotFoundError:
        # EAFP: attempting the delete and ignoring a missing file is
        # equivalent to the exists() pre-check, without the extra stat call.
        pass
# function to calculate correlation matrix values
def create_correlation_matrix(data, towns, field):
    """Return a symmetric town-by-town Pearson correlation matrix for *field*.

    Each town's series of ``field`` values is mean-centred and scaled so the
    dot product computed by ``np.correlate`` equals the Pearson correlation
    coefficient. Cells are initialised to -13 — an impossible correlation —
    so any entry that was never filled is immediately visible.
    """
    n_towns = len(towns)
    matrix = np.full((n_towns, n_towns), -13.0)
    for i, first_town in enumerate(towns):
        series_a = np.array(data.loc[data['city'] == first_town][field])
        # Dividing by (std * len) here makes the later dot product against a
        # std-normalised second series equal the Pearson coefficient.
        series_a = (series_a - np.mean(series_a)) / (np.std(series_a) * len(series_a))
        # A series is perfectly correlated with itself.
        matrix[i, i] = 1.0
        # The matrix is symmetric, so only the upper triangle is computed.
        for j in range(i + 1, n_towns):
            series_b = np.array(data.loc[data['city'] == towns[j]][field])
            series_b = (series_b - np.mean(series_b)) / (np.std(series_b))
            matrix[i, j] = np.correlate(series_a, series_b)[0]
            matrix[j, i] = matrix[i, j]
    return matrix
# Column chosen for the analysis; its correlation matrix is cached on disk.
CORRELATION_COLUMN = 'tempC'
CORRELATION_DATA_FILENAME = f'{CORRELATION_COLUMN}_correlation_data.npy'
unique_towns = sorted(list(df_data['city'].unique()))
# Reuse a previously computed matrix when possible; otherwise compute and cache.
if not os.path.exists(CORRELATION_DATA_FILENAME):
    print('Correlation file does not exist.. Creating one...')
    corr_matrix = create_correlation_matrix(data=df_data, towns=unique_towns, field=CORRELATION_COLUMN)
    np.save(CORRELATION_DATA_FILENAME, corr_matrix)
else:
    print('Correlation file exists!')
    corr_matrix = np.load(CORRELATION_DATA_FILENAME)
Correlation file exists!
# Render the correlation matrix as a heatmap and save it as a PNG.
CORRELATION_MATRIX_FILENAME = f'{CORRELATION_DIR}/{CORRELATION_COLUMN}_correlation_matrix.png'
remove_file_if_exists(CORRELATION_MATRIX_FILENAME)
heatmap = px.imshow(corr_matrix, x=unique_towns, y=unique_towns,
                    width=1300, height=1300)
heatmap.update_layout(title_text='Correlation Matrix', title_x=0.5)
heatmap.write_image(CORRELATION_MATRIX_FILENAME)
heatmap.show()
# function to plot correlation bar chart
def plot_town_bar_chart(cor, towns, field):
    """Write one correlation bar chart PNG per town under CORRELATION_DIR.

    For each town, plots its correlation value against every other town
    (the town's own self-correlation is dropped). Only the chart for
    'Rijeka' is also shown inline.

    cor   -- square correlation matrix aligned with *towns*
    towns -- list of town names (row/column order of *cor*)
    field -- column name used in the output file names
    """
    # BUG FIX: removed the unused ``asix_range`` local (dead code).
    for i, town in enumerate(towns):
        # Plain local, not a constant, so keep it lowercase.
        image_path = f'{CORRELATION_DIR}/{field}_{town}_correlation_chart.png'
        remove_file_if_exists(image_path)
        # Exclude the town itself (self-correlation is always 1).
        other_towns = towns.copy()
        other_towns.remove(town)
        other_values = np.delete(cor[i], i)
        curr_df = pd.DataFrame({'CITY': other_towns, 'VALUES': other_values})
        fig = px.bar(curr_df, x='CITY', y='VALUES',
                     hover_name='CITY', width=1000, height=500)
        fig.update_layout(title_text=f'Correlation - {town}', title_x=0.5)
        fig.update_xaxes(tickangle=90, tickmode='linear', title='')
        fig.update_yaxes(title='%', range=[0.5, 1.01])
        fig.write_image(image_path)
        if town == 'Rijeka':
            fig.show()
# call function for creating bar charts for each town
# Writes one PNG per town into CORRELATION_DIR; only Rijeka's chart is shown inline.
plot_town_bar_chart(corr_matrix, unique_towns, CORRELATION_COLUMN)
def create_graph(corr_matrix, unique_towns, unique_towns_index_sorted):
    """Build an adjacency list linking towns whose correlation is below 0.95.

    corr_matrix              -- square correlation matrix aligned with *unique_towns*
    unique_towns             -- list of town names
    unique_towns_index_sorted -- town indices in geographic (lng, lat) order

    Returns a dict mapping each town name to the list of later-ordered towns
    with which its correlation is < 0.95.
    """
    ret_graph = {}
    # BUG FIX: the original body iterated the module-level global
    # ``unique_towns_index_sort`` and silently ignored the
    # ``unique_towns_index_sorted`` parameter; it only worked because the
    # global happened to be defined before the call site.
    for i in unique_towns_index_sorted:
        town1 = unique_towns[i]
        ret_graph[town1] = []
        # having in mind that corr_matrix[i,j] == corr_matrix[j,i]
        # NOTE(review): ``i`` (a town index) is also used as a *position*
        # into the sorted list here, mirroring the original slice semantics —
        # confirm this is intended when the permutation is not the identity.
        for j in unique_towns_index_sorted[i + 1:]:
            town2 = unique_towns[j]
            # Low correlation (< 0.95) connects the two towns.
            if corr_matrix[i][j] < 0.95:
                ret_graph[town1].append(town2)
    return ret_graph
# plot map with values from SVD_V (towns to concept)
def plot_svd_map(unique_towns, vector, k, data_geo):
    """Plot towns on a mapbox scatter map coloured by *vector* and save a PNG.

    vector   -- one value per row of *data_geo*, stored in a 'VALUES' column
    data_geo -- DataFrame with 'LAT', 'LNG' and 'CITY' columns (mutated in place)

    NOTE(review): ``unique_towns`` and ``k`` are unused in the body — kept
    for call-site compatibility; confirm whether they were meant to be used.
    """
    corr_map_filename = f'{CORRELATION_DIR}/{CORRELATION_COLUMN}_correlation_map.png'
    remove_file_if_exists(corr_map_filename)
    data_geo['VALUES'] = vector
    # BUG FIX: close the token file — the original open(...).read() leaked
    # the file handle.
    with open(".mapbox_token") as token_file:
        px.set_mapbox_access_token(token_file.read())
    fig = px.scatter_mapbox(data_geo, lat="LAT", lon="LNG",
                            color="VALUES", hover_name="CITY",
                            color_continuous_scale=px.colors.cyclical.Phase)
    fig.write_image(corr_map_filename)
    fig.show()
# sort indexes of unique_towns based on lng, lat
GEO_POSITION_FILENAME = 'geo_position.csv'
# Sorting by (LNG, LAT) reorders the index; the reordered index values give
# the geographic visiting order of the towns.
df_geo_position = pd.read_csv(GEO_POSITION_FILENAME).sort_values(by=['LNG', 'LAT'])
unique_towns_index_sort = list(df_geo_position.index)
corr_graph = create_graph(corr_matrix, unique_towns, unique_towns_index_sort)
# save notebook before nbconvert
import IPython
# NOTE(review): the lines below are Jupyter cell magics, not plain Python —
# the %%javascript cell saves the notebook from the browser side, and the
# "!" line shells out to nbconvert; neither will run outside a notebook.
%%javascript
IPython.notebook.save_notebook()
# export notebook results to HTML
!jupyter nbconvert --to=HTML correlation.ipynb